# Copyright (c) 2021 changwei@iscas.ac.cn
# 
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# 
# 1. Redistributions of source code must retain the above copyright notice, this list of
#    conditions and the following disclaimer.
# 
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
#    of conditions and the following disclaimer in the documentation and/or other materials
#    provided with the distribution.
# 
# 3. Neither the name of the copyright holder nor the names of its contributors may be used
#    to endorse or promote products derived from this software without specific prior written
#    permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
import codecs
import copy
import json
import os
import re
import string

# Matches a markdown link "[title](path)"; group 1 = title, group 2 = path.
# Raw strings fix the invalid '\[' escape (SyntaxWarning on modern Python);
# the compiled pattern value is unchanged.
TITLE_LINK_REGEX_PATTERN = re.compile(r'\[(.*?)\]\((.*)\)')
# Matches a bare "[title]" with no link target; group 1 = title.
TITLE_ONLY_REGEX_PATTERN = re.compile(r'\[(.*?)\]')


def parse_line(line: str) -> dict:
    """Parse one line of the doc-tree file into a node dict.

    Depth is the count of '——>' arrows (or, failing that, '#' marks) in
    the line; the markers are then stripped.  If the remaining text is a
    markdown link "[title](path)", the title is normalized with
    :func:`filter_title` and the path recorded under 'md_path'.

    Args:
        line: a whitespace-stripped line from the doc-tree file.

    Returns:
        {'depth': int, 'title': str} plus 'md_path' when a link is present.

    Raises:
        ValueError: if the line carries neither depth marker.
    """
    if '——>' in line:
        depth = line.count('——>')
        line = line.replace('——>', '')
    elif '#' in line:
        depth = line.count('#')
        line = line.replace('#', '')
    else:
        # Was `assert False`: stripped under `python -O`, which would have
        # left `depth` unbound.  Raise explicitly instead.
        raise ValueError(f'no depth marker in line: {line!r}')
    link_match = TITLE_LINK_REGEX_PATTERN.findall(line)
    if link_match:
        title, path = link_match[0]
        return {'depth': depth, 'title': filter_title(title), 'md_path': path}
    title_match = TITLE_ONLY_REGEX_PATTERN.findall(line)
    if title_match:
        line = title_match[0]
    return {'depth': depth, 'title': line}


def filter_title(title: str) -> str:
    """Normalize a title for anchor use: spaces become hyphens, while
    backslashes and double quotes are dropped."""
    # Single C-level pass instead of three chained .replace() calls.
    return title.translate(str.maketrans({' ': '-', '\\': None, '"': None}))


def plant_tree(lines_info: list) -> list:
    """Fold a flat list of {'depth': int, ...} dicts into a nested tree.

    A run of entries deeper than the current level becomes the 'children'
    list of the last node at the shallower level.  Depth may only increase
    one step at a time.  Entries are deep-copied, so the input is not
    mutated.

    Args:
        lines_info: nodes in document order, each carrying a 'depth' key.

    Returns:
        The list of depth-1 nodes ([] for empty input — the original
        returned None here, contradicting its own annotation).

    Raises:
        ValueError: if an entry's depth jumps by more than one level.
    """
    if not lines_info:
        return []
    current_depth = 1
    current_list = []
    depth_stack = []  # parent-level lists awaiting 'children' attachment
    index = 0
    while index < len(lines_info):
        line_info = lines_info[index]
        if line_info['depth'] > current_depth:
            if line_info['depth'] - 1 != current_depth:
                raise ValueError(
                    f"depth jumps from {current_depth} to {line_info['depth']}")
            depth_stack.append(current_list)
            current_list = [copy.deepcopy(line_info)]
            current_depth += 1
            index += 1
        elif line_info['depth'] == current_depth:
            current_list.append(copy.deepcopy(line_info))
            index += 1
        else:
            # Pop back one level: attach the finished deeper list to the
            # last node of the parent level, then re-examine this entry.
            child_list = current_list
            current_list = depth_stack.pop()
            current_list[-1]['children'] = child_list
            current_depth -= 1

    # Unwind any levels still open when input ends.
    while depth_stack:
        child_list = current_list
        current_list = depth_stack.pop()
        current_list[-1]['children'] = child_list

    return current_list


def check_file_list_existence(input_file_list: str, input_doc_root: str,
                              output_file_list: str):
    """Verify every leaf 'md_path' in a doc-tree JSON file exists on disk.

    Walks the nested list loaded from ``input_file_list``.  Nodes with
    'children' are recursed into; leaves must carry an 'md_path' that
    resolves, relative to ``input_doc_root``, to an existing file.  All
    problems are written one per line to ``output_file_list``.

    Raises:
        AssertionError: if any link is missing or pathless (raised after
            the report file is written).
    """
    not_found_list = []

    def _check_file_existence(json_list: list):
        # Depth-first walk of the tree.
        for json_dict in json_list:
            if 'children' in json_dict:
                _check_file_existence(json_dict['children'])
            elif 'md_path' in json_dict:
                file_abs_path = os.path.join(input_doc_root,
                                             json_dict['md_path'])
                if not os.path.exists(file_abs_path):
                    print(json_dict['title'], 'md 文件不存在!')
                    not_found_list.append(json_dict['md_path'])
            else:
                print(json_dict['title'], '没有 md 文件路径!')
                not_found_list.append(f'{json_dict["title"]} ：没有 md 文件路径')

    # Builtin open() with an explicit encoding replaces dated codecs.open().
    with open(input_file_list, 'r', encoding='utf-8') as in_file:
        file_list = json.load(in_file)

    _check_file_existence(file_list)

    with open(output_file_list, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{name}\n' for name in not_found_list)

    # Explicit raise (not `assert`) so the check survives `python -O`;
    # AssertionError is kept so existing callers see the same exception.
    if not_found_list:
        raise AssertionError('目录存在无效链接！')


def parse_doc_tree_main(input_filename, output_filename):
    """Convert a markdown doc-tree listing into a nested JSON tree.

    Reads ``input_filename`` line by line, strips all whitespace and '@'
    characters, parses each non-empty line with :func:`parse_line`, folds
    the results into a tree with :func:`plant_tree`, and dumps the tree
    as UTF-8 JSON (indent=2, non-ASCII preserved) to ``output_filename``.
    """
    # Local variable renamed from LINES_INFO: UPPER_SNAKE is for constants.
    lines_info = []
    # Hoisted out of the loop: the table is loop-invariant.
    strip_whitespace = {ord(c): None for c in string.whitespace}
    # STEP 1a: read the md doc tree and record per-line info.
    with open(input_filename, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            line = line.translate(strip_whitespace).replace('@', '')
            if not line:
                continue
            lines_info.append(parse_line(line))
    # STEP 1b: fold per-depth info into a tree (stack-based).
    output_list = plant_tree(lines_info)

    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_list, output_file, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    # No CLI behavior is wired up here; the guard is deliberately a no-op.
    pass
